Analysis of Differential Gene Expression with DESeq2

Introduction

This presentation introduces a complete RNA-Seq analysis workflow using the DESeq2 package. You will learn how to:

  • Load and quality-filter RNA-Seq count data.
  • Perform differential expression analysis.
  • Explore the data with PCA, MDS, and pairs plots.
  • Visualize results interactively using plotly and heatmaply .
  • Annotate genes with FlyBase information.

The pasilla dataset comes from an experiment investigating the effect of siRNA knockdown of the pasilla gene in Drosophila.

1. Setup and Load Libraries

library(DESeq2)
library(ggplot2)
library(plotly)
library(GGally)
library(matrixStats)

Data Loading and Preparation - Count Matrix

# Load the count matrix and sample annotation.
# Adjust file paths as needed.
countData <- read.delim("../data/deseq/pasilla_gene_counts.tsv", row.names = 1)
head(countData)
            untreated1 untreated2 untreated3 untreated4 treated1 treated2
FBgn0000003          0          0          0          0        0        0
FBgn0000008         92        161         76         70      140       88
FBgn0000014          5          1          0          0        4        0
FBgn0000015          0          2          1          2        1        0
FBgn0000017       4664       8714       3564       3150     6205     3072
FBgn0000018        583        761        245        310      722      299
            treated3
FBgn0000003        1
FBgn0000008       70
FBgn0000014        0
FBgn0000015        0
FBgn0000017     3334
FBgn0000018      308

Sample Annotation

colData <- read.csv("../data/deseq/pasilla_sample_annotation.csv", row.names = 1)
# Ensure the 'condition' variable is a factor.
colData$condition <- as.factor(colData$condition)
head(colData)
           condition        type number.of.lanes total.number.of.reads
treated1     treated single-read               5              35158667
treated2     treated  paired-end               2         12242535 (x2)
treated3     treated  paired-end               2         12443664 (x2)
untreated1 untreated single-read               2              17812866
untreated2 untreated single-read               6              34284521
untreated3 untreated  paired-end               2         10542625 (x2)
           exon.counts
treated1      15679615
treated2      15620018
treated3      12733865
untreated1    14924838
untreated2    20764558
untreated3    10283129

Adjust Data Order

# It is critical that the order of columns in countData matches the rows in colData.
all(rownames(colData) == colnames(countData))  # Should be TRUE
[1] FALSE
colData <- colData[colnames(countData), ]
all(rownames(colData) == colnames(countData))
[1] TRUE
print(colData)
           condition        type number.of.lanes total.number.of.reads
untreated1 untreated single-read               2              17812866
untreated2 untreated single-read               6              34284521
untreated3 untreated  paired-end               2         10542625 (x2)
untreated4 untreated  paired-end               2         12214974 (x2)
treated1     treated single-read               5              35158667
treated2     treated  paired-end               2         12242535 (x2)
treated3     treated  paired-end               2         12443664 (x2)
           exon.counts
untreated1    14924838
untreated2    20764558
untreated3    10283129
untreated4    11653031
treated1      15679615
treated2      15620018
treated3      12733865

3. Create DESeqDataSet and Filter Genes

# Create a DESeqDataSet object using the count data and sample annotation.
dds <- DESeqDataSetFromMatrix(countData = countData,
                              colData = colData,
                              design = ~ condition)

# Add gene names as metadata to the DESeqDataSet.
metadata <- data.frame(gene = rownames(countData))
mcols(dds) <- DataFrame(mcols(dds), metadata)
print(dds)
class: DESeqDataSet 
dim: 14599 7 
metadata(1): version
assays(1): counts
rownames(14599): FBgn0000003 FBgn0000008 ... FBgn0261574 FBgn0261575
rowData names(1): gene
colnames(7): untreated1 untreated2 ... treated2 treated3
colData names(5): condition type number.of.lanes total.number.of.reads
  exon.counts

Filter Genes with Low Counts

# Filter out genes with low counts (total count < 3) to reduce noise.
keep <- rowSums(counts(dds)) >= 3
dds <- dds[keep, ]
cat("Number of genes after filtering:", nrow(dds), "\n")
Number of genes after filtering: 11199 
# Set 'untreated' as the reference level for condition.
print(levels(dds$condition))
[1] "treated"   "untreated"
dds$condition <- relevel(dds$condition, ref = "untreated")
print(levels(dds$condition))
[1] "untreated" "treated"  

4. Differential Expression Analysis with DESeq2

# Run the full DESeq2 pipeline:
# 1. Normalization, 2. Dispersion estimation, 3. Model fitting, and 4. Hypothesis testing.
dds <- DESeq(dds)

# Extract results with the default FDR threshold (0.1).
res <- results(dds)
summary(res)

out of 11199 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 513, 4.6%
LFC < 0 (down)     : 534, 4.8%
outliers [1]       : 1, 0.0089%
low counts [2]     : 2606, 23%
(mean count < 5)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
# Extract results using a stricter FDR threshold (alpha = 0.05).
res05 <- results(dds, alpha = 0.05)
summary(res05)

out of 11199 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 408, 3.6%
LFC < 0 (down)     : 432, 3.9%
outliers [1]       : 1, 0.0089%
low counts [2]     : 2823, 25%
(mean count < 6)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

5. Variance Stabilizing Transformation (VST)

# RNA-Seq count data have wide ranges and variance increases with the mean.
# VST transforms the data so that variance is approximately constant, making it
# better suited for visualizations like PCA and heatmaps.
vsd <- vst(dds, blind = FALSE)
pcaData <- plotPCA(vsd, intgroup = "condition", returnData = TRUE)
pcaData$type <- colData$type  # Add sequencing type for extra context.
percentVar <- round(100 * attr(pcaData, "percentVar"))

PCA: Principal Component Analysis

p_pca <- ggplot(pcaData, aes(PC1, PC2, color = condition, shape = type)) +
  geom_point(size = 3) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  ggtitle("PCA of pasilla Samples") +
  theme_minimal()
p_pca

Interactive PCA Plot

# Convert the PCA plot into an interactive plot using plotly.
ggplotly(p_pca)

MDS: Multidimensional Scaling

# Compute a distance matrix from the VST data and perform MDS.
dists <- dist(t(assay(vsd)))
mds <- cmdscale(dists)
mdsData <- data.frame(MDS1 = mds[, 1],
                      MDS2 = mds[, 2],
                      condition = colData$condition,
                      type = colData$type)
# Static MDS plot with ggplot2.
p_mds <- ggplot(mdsData, aes(MDS1, MDS2, color = condition, shape = type)) +
  geom_point(size = 3) +
  ggtitle("MDS Plot of pasilla Samples") +
  theme_minimal()
p_mds

7. Pairs Plot

# Create a pairs plot to explore relationships between samples.
# The transparency (alpha) is set to reduce overplotting.
vsd_df <- as.data.frame(assay(vsd))
pairs(vsd_df, col="#00000010")

MA Plot

# The MA plot shows the log2 fold changes versus the mean expression.
# Significant genes (FDR < 0.05) are highlighted.
plotMA(res05, main = "DESeq2 MA-Plot (FDR < 0.05)", ylim = c(-5, 5))

Volcano Plot

# Create a volcano plot to display the relationship between fold change and significance.
volcanoData <- as.data.frame(res)
volcanoData$gene <- rownames(volcanoData)
p_volcano <- ggplot(volcanoData, aes(x = log2FoldChange, y = -log10(padj), text = gene)) +
  geom_point(alpha = 0.4) +
  ggtitle("Volcano Plot") +
  xlab("Log2 Fold Change") +
  ylab("-Log10 Adjusted P-Value") +
  theme_minimal()

Interactive Volcano Plot

# Make the volcano plot interactive.
ggplotly(p_volcano, tooltip = c("text", "x", "y"))

9. Accounting for Technical Variation

Sometimes technical variables—like sequencing type (single-read vs. paired-end)—can affect your results. To account for this, include the technical variable as a covariate in your design:

# Adjust the 'type' variable to remove extra characters and ensure it's a factor.
colData$type <- as.factor(gsub("-.+","", colData$type))
dds2 <- DESeqDataSetFromMatrix(countData = countData,
                               colData = colData,
                               design = ~ type + condition)
dds2$condition <- relevel(dds2$condition, ref = "untreated")
dds2 <- DESeq(dds2)

# Extract results accounting for technical variation.
res2 <- results(dds2, contrast = c("condition", "untreated", "treated"))
summary(res2)

out of 12359 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 717, 5.8%
LFC < 0 (down)     : 613, 5%
outliers [1]       : 0, 0%
low counts [2]     : 3560, 29%
(mean count < 4)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Static Heatmaps with pheatmap

# Normalize data for heatmap visualization.
ntd2 <- normTransform(dds2)
library(pheatmap)
df <- as.data.frame(colData(dds2)[, c("condition", "type")])
# Heatmap with hierarchical clustering for differentially expressed genes (padj < 0.01).
select <- res2$padj < 0.001
select[is.na(select)] <- FALSE
pheatmap(assay(ntd2)[select, ], cluster_rows = TRUE, show_rownames = FALSE,
         cluster_cols = TRUE, annotation_col = df, scale = "row",
         color = colorRampPalette(c("blue", "white", "red"))(100))

Interactive Heatmap with heatmaply

# Create an interactive heatmap and export it as an HTML file.
library(heatmaply)
library(htmlwidgets)
p <- heatmaply(assay(ntd2)[select, ], scale = "row", k_row = 3, k_col = 3)
saveWidget(p, "../work_dir/heatmaply.html")
p  # Display interactive heatmap

11. Gene Annotation

# Retrieve gene annotations using the org.Dm.eg.db package.
# This provides gene symbols and descriptions based on FlyBase IDs.

library(org.Dm.eg.db)
library(AnnotationDbi)
fly_ids <- rownames(countData)
fly_genes <- select(org.Dm.eg.db, keys = fly_ids, columns = c("SYMBOL", "GENENAME"),
                    keytype = "FLYBASE")

# Merge annotation data with differential expression results.
res2_05 <- results(dds2, alpha = 0.05, contrast = c("condition", "untreated", "treated"))
res2_05_reordered <- res2_05[order(res2_05$padj), ]
res2_05_annotated <- merge(as.data.frame(res2_05_reordered),
                           fly_genes, by.x = "row.names",
                           by.y = "FLYBASE", all.x = TRUE)
res2_05_annotated <- res2_05_annotated[order(res2_05_annotated$padj), ]
write.csv(res2_05_annotated, "../work_dir/deseq2_results_annotated.csv", row.names = FALSE)

Table of Annotated Results

Row.names baseMean log2FoldChange lfcSE stat pvalue padj SYMBOL GENENAME
612 FBgn0003360 4.3e+03 3.1e+00 1.1e-01 2.9e+01 0e+00 0e+00 sesB stress-sensitive B
2624 FBgn0026562 4.4e+04 2.5e+00 8.7e-02 2.8e+01 0e+00 0e+00 SPARC Secreted protein, acidic, cysteine-rich
9831 FBgn0039155 7.3e+02 4.6e+00 1.7e-01 2.8e+01 0e+00 0e+00 Kal1 Kallmann syndrome 1
2366 FBgn0025111 1.5e+03 -2.9e+00 1.0e-01 -2.7e+01 0e+00 0e+00 Ant2 Adenine nucleotide translocase 2
3192 FBgn0029167 3.7e+03 2.2e+00 9.6e-02 2.3e+01 0e+00 0e+00 Hml Hemolectin
6948 FBgn0035085 6.4e+02 2.6e+00 1.3e-01 2.0e+01 0e+00 0e+00 CG3770 uncharacterized protein
10305 FBgn0039827 2.6e+02 4.2e+00 2.2e-01 1.9e+01 0e+00 0e+00 CG1544 uncharacterized protein
6711 FBgn0034736 2.3e+02 3.5e+00 2.1e-01 1.7e+01 0e+00 0e+00 gas gasoline
3411 FBgn0029896 4.9e+02 2.4e+00 1.5e-01 1.7e+01 0e+00 0e+00 CG3168 uncharacterized protein
30 FBgn0000071 3.4e+02 -2.6e+00 1.6e-01 -1.7e+01 0e+00 0e+00 Ama Amalgam
9598 FBgn0038832 3.0e+02 2.5e+00 1.8e-01 1.4e+01 0e+00 0e+00 CG15695 uncharacterized protein
6815 FBgn0034897 8.9e+02 1.9e+00 1.4e-01 1.4e+01 0e+00 0e+00 Sesn Sestrin
10415 FBgn0040091 1.1e+03 1.6e+00 1.2e-01 1.3e+01 0e+00 0e+00 Ugt317A1 UDP-glycosyltransferase family 317 member A1
7020 FBgn0035189 2.2e+02 -2.9e+00 2.2e-01 -1.3e+01 0e+00 0e+00 CG9119 uncharacterized protein
6495 FBgn0034434 1.1e+02 3.7e+00 2.8e-01 1.3e+01 0e+00 0e+00 NA NA
8839 FBgn0037754 2.7e+02 2.3e+00 1.8e-01 1.3e+01 0e+00 0e+00 CG8500 uncharacterized protein
2709 FBgn0027279 3.0e+03 1.2e+00 9.2e-02 1.3e+01 0e+00 0e+00 l(1)G0196 lethal (1) G0196
14579 FBgn0261552 5.1e+03 1.9e+00 1.5e-01 1.3e+01 0e+00 0e+00 ps pasilla
304 FBgn0001226 1.3e+03 -1.7e+00 1.3e-01 -1.3e+01 0e+00 0e+00 Hsp27 Heat shock protein 27
10421 FBgn0040099 9.1e+02 1.7e+00 1.4e-01 1.3e+01 0e+00 0e+00 lectin-28C lectin-28C